Load packages

library(ggplot2)
library(nlme)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(gcookbook)
## 
## Attaching package: 'gcookbook'
## The following object is masked from 'package:plotly':
## 
##     wind

DATA

The Milk dataset is loaded with the nlme package.

The Milk dataset contains 1337 rows of the following 4 columns:

str(Milk)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   1337 obs. of  4 variables:
##  $ protein: num  3.63 3.57 3.47 3.65 3.89 3.73 3.77 3.9 3.78 3.82 ...
##  $ Time   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Cow    : Ord.factor w/ 79 levels "B04"<"B14"<"B03"<..: 25 25 25 25 25 25 25 25 25 25 ...
##  $ Diet   : Factor w/ 3 levels "barley","barley+lupins",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language protein ~ Time | Cow
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time since calving"
##   ..$ y: chr "Protein content of milk sample"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(weeks)"
##   ..$ y: chr "(%)"
  • protein: numeric, protein content of milk.

  • Time: numeric, time since calving.

  • Cow: ordered factor, cow ID.

  • Diet: factor, diet of the cow; levels = barley, barley+lupins, lupins.

ggplot

ggplot 是堆疊式的繪圖

  1. 首先給出框架
a <- ggplot(data = Milk, aes(x=Time, y=protein))
a

  2. 利用 geom 增加內容
a+geom_point(aes(color=Diet))

aes()

aes() 可以控制滿多的東西

  • x: x 軸放的變數

  • y: y 軸放的變數

  • color: 依照類別給顏色

  • linetype: 線的樣式(solid, dashed, dotted, etc)

  • shape: 點的形狀

  • size: 物件的大小

  • alpha: 透明度

a+ geom_point(aes(shape=Diet))

a+ geom_point(aes(size=Diet))
## Warning: Using size for a discrete variable is not advised.

a+ geom_point(color="blue", size=5, shape=5)

geom functions that require only x

pro <- ggplot(Milk, aes(x=protein))
pro

histogram

pro+geom_histogram(bins=50,color="#33691E",aes(fill=..count..))

pro+geom_histogram(binwidth = 0.1,fill="#33691E")

density and freqpoly()

pro+geom_freqpoly(bins=50)

pro+geom_density(color="#33691E",size=2, linetype=2,fill="blue",alpha=0.3)

bar

ggplot(Milk, aes(x=Diet)) + geom_bar(size=2,color="red",fill="blue",alpha=0.5)+geom_text(stat="count",aes(label=..count..),vjust=-1)+ylim(0,500)

geoms that require more than x

p2 <- ggplot(Milk, aes(x=Time, y=protein))

scatter plot

p2+geom_point()

lines

p2+geom_line(aes(group=Cow,color=Diet))

boxplot

p3 <- p2+ geom_boxplot(aes(group=Time)); p3

p3+facet_wrap(~Diet)

### bar

  • 已經算好的數值用 stat = "identity"
pg_mean
ggplot(pg_mean,aes(x=group,y=weight))+geom_bar(stat="identity")

cabbage_exp
ggplot(cabbage_exp,aes(x=Date,y=Weight,fill=Cultivar))+geom_bar(stat="identity")

ggplot(cabbage_exp,aes(x=Date,y=Weight,fill=Cultivar))+geom_bar(stat="identity",position = "dodge")

## geom_text and geom_label

ggplot(Milk,aes(x=as.factor(Time),fill=Diet))+geom_bar()+geom_text(stat="count",aes(label=..count..), position = position_stack(vjust = 0.5) )

ggplot(cabbage_exp,aes(x=Date,y=Weight,fill=Cultivar))+geom_bar(stat="identity")+geom_text( aes(label=Weight), position = position_stack(vjust = 0.5) )

ggplot(cabbage_exp,aes(x=Date,y=Weight,fill=Cultivar))+geom_bar(stat="identity",position = "dodge")+geom_text(aes(label=Weight),position=position_dodge(0.9),vjust=1.5)

ggplot(iris,aes(x=Sepal.Length ,y=Sepal.Width,label=Species ))+geom_text()

ggplot(iris,aes(x=Sepal.Length ,y=Sepal.Width,label=Species,color=Species ))+geom_text(check_overlap = TRUE)

ggplot(iris,aes(x=Sepal.Length ,y=Sepal.Width,label=Species ))+geom_label()

stat_summary

  • mean_cl_boot: mean and bootstrapped confidence interval (default 95%)

  • mean_cl_normal: mean and t-distribution based confidence interval (default 95%)

  • mean_sdl: mean plus or minus standard deviation times some constant(default constant=2)

  • median_hilow: median and outer quantiles (default outer quantiles = 0.025 and 0.975)

  • mean_se: mean \(\pm\) se,內定。

ggplot(Milk, aes(x=Time, y=protein)) +  stat_summary(fun.data="mean_cl_boot") +ggtitle("mean_cl_boot")

ggplot(Milk, aes(x=Time, y=protein)) +  stat_summary(fun.data="mean_cl_normal")+ggtitle("mean_cl_normal")

ggplot(Milk, aes(x=Time, y=protein)) +  stat_summary(fun.data="mean_se")+ggtitle("mean_se")


ggplot(Milk, aes(x=Time, y=protein)) + stat_summary(fun.y="mean", geom="point")

# Mean of the natural logarithms of `y`.
meanlog <- function(y) {
  log_values <- log(y)
  mean(log_values)
}
ggplot(Milk, aes(x=Time, y=protein)) + stat_summary(fun.y="meanlog", geom="line",col="red",size=3)

Faceting

facet_wrap()

依據類別分成各個小區塊

ggplot(Milk, aes(x=protein, color=Diet)) + geom_density() 

ggplot(Milk, aes(x=protein, color=Diet)) + geom_density() + facet_wrap(~Time)

facet_grid

ggplot(Milk, aes(x=Time, y=protein)) + geom_point()

ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + facet_grid(Diet~.)

ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + facet_grid(Diet~.) + theme(strip.text.y = element_text(angle = 0))

ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + facet_grid(Diet~.) + theme(axis.title.y = element_text(angle = 0,vjust=0.5))

Themes

Themes 控制背景樣式,例如:

  • background color
  • size of fonts
  • gridline (網格線)
  • color of labels

控制的元素為

  • element_line: can specify color, size, linetype, etc.
  • element_rect: can specify fill, color, size, etc.
  • element_text: can specify family,size,color, etc.
  • element_blank: remove theme elements from graph.

plot elements

pt <- ggplot(Milk, aes(x=Time, y=protein)) + geom_point()
pt + theme(plot.background = element_rect(fill="lightgreen")) 

Axis elements

pt + theme(axis.text.x=element_text(color="blue",size=15))

pt + theme(axis.title.y=element_text(color="red",size=15,angle = 0, vjust = 0.5))

df <- data.frame(
  x=c("aaaaaaaaaaaaaaaa","aaaaaaaaaaaaaaaaaaaaaaaaaaaa","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
  y=1:3
)

base <- ggplot(df,aes(x=x,y=y))+geom_point(); base

base+theme(axis.text.x=element_text(angle = -30,hjust=0))

legend elements

base <- ggplot(data = Milk, aes(x=Time, y=protein, color=Diet))+geom_point()
base + theme(legend.background = element_rect(fill="lightblue",color="black",size=2))

base + theme(legend.key = element_rect(color="green"),legend.key.size =unit(1,"cm"))

base + theme(legend.text = element_text(size=15,face="bold"))

### panel elements

base

base + theme(panel.background = element_rect(fill="lightgreen",color="blue"))

base + theme(panel.grid.major = element_line(color="blue",size=5))

base + theme(aspect.ratio = 2); base + theme(aspect.ratio = 1/2)

### facetting elements

base <- ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + facet_grid(Diet~.)
base + theme(strip.background = element_rect(fill="red"),strip.text = element_text(face="bold",size=15))

# Widen the gap between facet panels. `panel.spacing` is used because the
# older `panel.margin` theme element is deprecated and triggers a warning.
base + theme(panel.spacing = unit(3, "cm"))

scale

scale_x_continuous and scale_y_continuous

# Scatter plot of protein against time, reused to demonstrate axis scales.
p4 <- ggplot(Milk, aes(x = Time, y = protein)) + geom_point()
# Rename the x axis and replace the tick labels at 5, 10 and 15 with words.
# The argument is spelled out as `labels`; `label` only worked through
# partial argument matching.
p4 + scale_x_continuous(
  "時間",
  breaks = c(5, 10, 15),
  labels = c("five", "ten", "fifteen")
)

df <- data.frame(
  x = rnorm(10) * 100000,
  y = seq(0, 1, length.out = 10)
)
p2 <- ggplot(df, aes(x, y)) + geom_point()
p2 + scale_y_continuous(labels = scales::percent)

p2 + scale_y_continuous(labels = scales::dollar)

p2 + scale_y_continuous(labels = scales::comma)

p2 + scale_y_continuous(labels = scales::scientific)

scale_color

df <- data.frame(
  x = runif(100),
  y = runif(100),
  z2 = abs(rnorm(100))
)
ggplot(df, aes(x, y)) +
  geom_point(aes(colour = z2)) +
  scale_colour_gradient(low = "white", high = "black")

# Density of protein, filled by Diet; alpha = 1/3 keeps overlapping areas visible.
dDiet <- ggplot(Milk, aes(x=protein, fill=Diet)) + geom_density(alpha=1/3)
dDiet + scale_fill_hue()  # mainly for telling factor levels apart, so the colors differ widely

dDiet + scale_fill_brewer()  # continuous (sequential-looking) palette

Example

DATA

The data consist of 60 observations and 3 variables;

  • len : numeric, tooth (odontoblast, actually) length
  • supp: factor, supplement type, 2 levels: “VC” is ascorbic acid and “OJ” is orange juice
  • dose: numeric, dose (mg/day)
ToothGrowth
str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

Exploring distributions

ggplot(ToothGrowth, aes(x=as.factor(dose), fill=supp)) + geom_bar()+geom_text(stat="count",  aes(label=..count..), position = position_stack(vjust = 0.5))

ggplot(ToothGrowth,aes(x=len))+geom_density()

ggplot(ToothGrowth,aes(x=len))+geom_density(aes(color=supp))

tp <- ggplot(ToothGrowth, aes(x=dose,y=len))

tp + geom_point()

tp + stat_summary(fun.data = "mean_cl_normal")+stat_summary(fun.y="mean", geom="line")

tp + stat_summary(fun.data = "mean_cl_normal")+stat_summary(fun.y="mean", geom="line")+aes(color=supp)

fit model

# Squared dose, stored as its own column so the formula can refer to it.
ToothGrowth[["dose2"]] <- ToothGrowth[["dose"]]^2

# Tooth length modelled on dose, squared dose, supplement type, and the
# squared-dose-by-supplement interaction.
lm2 <- lm(
  formula = len ~ dose + dose2 * supp,
  data = ToothGrowth
)

summary(lm2)
## 
## Call:
## lm(formula = len ~ dose + dose2 * supp, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6804 -2.4471 -0.4002  2.8956  7.8639 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.7492     2.7984   0.268  0.78992    
## dose          30.1550     5.2475   5.747 4.11e-07 ***
## dose2         -8.7238     2.0403  -4.276 7.64e-05 ***
## suppVC        -6.4783     1.3762  -4.707 1.74e-05 ***
## dose2:suppVC   1.5876     0.5771   2.751  0.00802 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.621 on 55 degrees of freedom
## Multiple R-squared:  0.7911, Adjusted R-squared:  0.7759 
## F-statistic: 52.07 on 4 and 55 DF,  p-value: < 2.2e-16

fortify

fortify 可以讓我們在做模型驗證變得簡單

  • .hat: leverages (influence)
  • .sigma: residual standard deviation when observation dropped from model
  • .cooksd: Cook’s distance
  • .fitted: fitted (predicted) values
  • .resid: residuals
  • .stdresid: standardized residuals
lm2sum <- fortify(lm2); lm2sum
ggplot(lm2sum, aes(sample=.stdresid)) + stat_qq() + geom_abline()

# Prediction grid: doses from 0.5 to 2 in steps of 0.01, each listed twice so
# the recycled two-level `supp` factor pairs every dose with both OJ and VC.
newdata <- data.frame(
  dose = rep(seq(0.5, 2, .01), each = 2),
  supp = factor(c("OJ", "VC"))
)

# The model uses a squared-dose column, so the grid needs one as well.
newdata$dose2 <- newdata$dose^2

# Append the model predictions and confidence bounds (`fit`, `lwr`, `upr`).
newdata <- data.frame(
  newdata,
  predict(lm2, newdata, interval = "confidence")
)
newdata

# Fitted curves, one per supplement type.
p1 <- ggplot(newdata, aes(x = dose, y = fit, color = supp)) +
  geom_line()
p1

# Add the confidence band around each fitted curve.
p2 <- p1 +
  geom_ribbon(aes(ymax = upr, ymin = lwr, fill = supp), alpha = 0.5)
p2

# Overlay the observed tooth lengths on the fitted curves.
p2 +
  geom_point(data = ToothGrowth, aes(y = len))

plotly

可以輸出簡易互動式圖表

ggplotly()

ggplotly(p2)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
p3 <- ggplot(Milk, aes(x=Time, y=protein)) + geom_boxplot(aes(group=Time)) + facet_wrap(~Diet); ggplotly(p3)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
p4 <- ggplot(Milk,aes(x=as.factor(Time),fill=Diet))+geom_bar(); ggplotly(p4)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`